# Import packages
import pandas as pd
import numpy as np
from openai import OpenAI
from k_means_constrained import KMeansConstrained

# Leave this empty to have the client read the key from the OPENAI_API_KEY
# environment variable, so no secret has to be stored in this file.
api_key = "" # Replace with your OpenAI API Key
# An empty string would be sent as-is; passing None instead lets the OpenAI
# client fall back to the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=api_key or None)

# Load Excel file
file_path = "../Data/NudgeUnits_nopub.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1')

# Preprocess by removing "[REDACTED]" from 'trialtitle'
cleaned_titles = (
    df['trialtitle']
    .str.replace(r'\[REDACTED\]', '', regex=True)
    .str.strip()
)
# Titles that are empty after cleaning are turned into missing values so the
# dropna() below removes them; the embeddings endpoint rejects empty strings.
df['trialtitle'] = cleaned_titles.mask(cleaned_titles == '')

# Subset relevant columns. The explicit copy makes df_subset an independent
# frame, so adding a column to it later cannot trigger pandas'
# SettingWithCopyWarning.
df_subset = df[['id', 'trialtitle']].dropna().copy()

# Function to get embeddings using OpenAI
def get_embeddings(texts, model="text-embedding-ada-002", batch_size=100):
    """Embed each text with the OpenAI embeddings endpoint.

    Texts are sent in batches rather than one request per text, which cuts
    the number of network round-trips.

    Args:
        texts: Iterable of strings to embed.
        model: Name of the embedding model to request.
        batch_size: Maximum number of texts sent in a single API request.

    Returns:
        A NumPy array with one embedding per input text, in input order.
        Empty input yields an empty array and makes no API request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize once so generators and other one-shot iterables can be sliced.
    texts = list(texts)
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=model)
        # Order by the index reported for each item so rows line up with inputs.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)
    return np.array(embeddings)

# Determine the number of groups
num_trials = len(df_subset)
group_size = 3
num_groups = num_trials // group_size

# Fail early (before any API request is made) when no group can be formed.
if num_groups == 0:
    raise ValueError(
        f"Need at least {group_size} trials to form a group, got {num_trials}"
    )

# When num_trials is not a multiple of group_size, the leftover trials have to
# be absorbed by some groups: KMeansConstrained rejects the fit when
# size_max * n_clusters is smaller than the number of samples. Ceiling
# division gives the smallest workable upper bound and equals group_size
# whenever the trials divide evenly.
max_group_size = -(-num_trials // num_groups)

# Generate embeddings for the trial titles
embeddings = get_embeddings(df_subset["trialtitle"].tolist())

# Perform constrained KMeans clustering
clf = KMeansConstrained(
    n_clusters=num_groups,
    size_min=group_size,
    size_max=max_group_size,
    random_state=0
)

# assign() returns a new frame carrying the cluster labels, which avoids
# writing into a frame that pandas may regard as a slice of df.
df_subset = df_subset.assign(group=clf.fit_predict(embeddings))

# Sort dataframe by group for clarity
grouped_df = df_subset.sort_values('group')

# Save grouped results to Excel
output_path = "../Data/RCT2scale_grouping2.xlsx"
grouped_df.to_excel(output_path, index=False)

print(f"Grouped file saved as: {output_path}")
